library(tidyverse)
library(kableExtra)
library(DT)
library(tidyr)
library(dplyr)
library(ggplot2)
library(plotly)
library(readxl)
library(tibble)
library(ggthemes)
library(maps)

library(plotly) # Interactive data visualizations
library(viridis) # Color gradients
library(lubridate)
library(qtlcharts) # Interactive scatter plots 

library(randomForest) 
library(ranger)       # a faster implementation of randomForest
library(caret)        # run many machine learning models
library(broom)  # try: `install.packages("backports")` if directly installing broom gives an error

library(reshape2) # to use melt()

library(deSolve)
library(permimp) # for calculating conditional importance in a speedy way

1 Guide for RMarkdown report (remove for final report)

1.1 Bibliographies in RMarkdown

  • references.bib: for BibTeX of all sources you wish to reference
  • references-pkg.bib: for BibTeX of all packages used in the report

Bibliographies will be included automatically at the end before appendix. Copy paste bib or BibTeX text from your sources into the file references.bib following the example format, leaving an empty line between entries. Name it however you want in the first line after @document-format{ like in the example. For any package you are using, if there isn’t already an entry for it in the bib file, then use the function citation("package-name") in R console and copy the BibTeX entry from the output, paste it into references-pkg.bib.

Examples of how to reference bib entries in-text (check the rmd file):

  • Garth reference: (Tarr, 2021)
  • tidyverse reference: (Wickham et al., 2019)

1.2 Sections cross-referencing

You can cross-reference sections, so that the link points to the relevant heading (check the rmd file):

  • use {#section-id} after a heading to label that section, where section-id is user-defined and acts as the identifier of the section
  • use \@ref(section-id) or \@ref(section-heading) if there is no identifier, to refer to the section you want to link to

Examples (check rmd file):

  • this text refers to the appendix A with the heading “Code” (Appendix A)
  • this text refers to the section labelled with id vri (Section 4.2)
  • this text refers to the section “Data Background” (Section 3)

Note this will only show the number of the section, you have to put “Section”/“Appendix” before the \@ref(...) as well.

1.3 Figures/Tables cross-referencing

  • use \@ref(fig:chunk-name) to refer to figure output by the chunk named as chunk-name
  • use \@ref(tab:chunk-name) to refer to the table output by the chunk named as chunk-name

Here is an example plot with caption and its in-text reference (check rmd file):

Note this will only show the number (default format: section-num.figure-num), you have to put “Figure” before the \@ref(fig:..) as well.

# example plot
# Boxplots of displacement by cylinder count, coloured by cylinder count.
plot_data <- mtcars %>%
  rownames_to_column("car") %>%
  as_tibble()

ggplot(plot_data, aes(x = factor(cyl), y = disp, colour = factor(cyl))) +
  geom_boxplot()
Example plot

Figure 1.1: Example plot

Here is an example table with caption and its in-text reference:

Note this will only show the number (default format: section-num.table-num), you have to put “Table” before the \@ref(tab:..) as well.

# example table
# Render mtcars as a scrollable, striped HTML table.
example_table <- kbl(
  mtcars,
  caption = "Example table",
  format = "html",
  booktabs = TRUE,
  row.names = TRUE,
  escape = TRUE
)

example_table %>%
  kable_styling(
    bootstrap_options = c("condensed", "striped", "hover"),
    position = "center"
  ) %>%
  scroll_box(height = "300px")
Table 1.1: Example table
mpg cyl disp hp drat wt qsec vs am gear carb
Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
Merc 450SL 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4
Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
Dodge Challenger 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2
AMC Javelin 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
Camaro Z28 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4
Pontiac Firebird 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2
Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8
Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2

2 Executive Summary

3 Data Background

# Main COVID dataset; parse the date column into Date objects.
covid_data <- read_csv("data/covid_data_latest.csv")
covid_data$date <- ymd(covid_data$date)

# Vaccination-policy time series; parse its Day column likewise.
policy <- read_csv("data/covid-vaccination-policy.csv")
policy$Day <- ymd(policy$Day)

# Supplementary country-level indicator datasets.
happiness <- read_csv("data/happiness-cantril-ladder.csv")
ghs <- read_csv("data/2021-GHS-Index-April-2022.csv")
corruption <- read_xlsx("data/CPI2020_GlobalTablesTS_210125.xlsx", sheet = 1, skip = 2)

# Defines ct_model(), used in the VRI section below.
source("data/ct_model.R")

4 Methods

4.1 Time lag between 1st and 2nd dose

# smoother function, returns smoothed column

# Smooth a numeric vector with LOWESS and return only the fitted values.
# `f` is the smoother span, passed straight through to stats::lowess().
Lowess <- function(data, f) {
  lowess(data, f = f)$y
}
# Restrict to countries with population > 1.5M and a known GDP per capita.
lag_covid = covid_data
lag_covid = lag_covid %>%
  filter(population > 1500000) %>%
  filter(gdp_per_capita > 0)
countrys <- unique(lag_covid$location)
# Locations excluded from the lag analysis (plus the "World" aggregate row).
# NOTE(review): presumably dropped for data-quality/sparsity reasons — confirm.
deleted <- c("Afghanistan", "Antigua and Barbuda", "Bangladesh","Benin", "Bhutan", "Bonaire Sint Eustatius and Saba", "Botswana", "Burundi","Burkina Faso", "Cameroon", "Cote d'Ivoire", "Democratic Republic of Congo", "Ethiopia","Eritrea", "Gabon", "Ghana", "Guernsey", "Guinea", "Kenya", "Kuwait", "Liberia", "Laos", "Namibia", "Nepal","Nicaragua", "Niger", "Nigeria", "Palestine", "Philippines", "Pitcairn", "Rwanda", "Saint Helena", "Senegal", "Sierra Leone", "Somalia", "South Sudan", "Sudan", "Tokelau", "Turkmenistan","Tanzania", "Uganda","Yemen", "World", "Zambia")
countrys = countrys[! countrys %in% deleted]
# Keep only the columns needed for the dose-lag analysis.
lag_covid = select(lag_covid, "date", "location", "people_vaccinated_per_hundred", "people_fully_vaccinated_per_hundred")
# Study window: 2021-02-01 (inclusive) to 2021-08-01 (exclusive).
start_date = "2021-02-01"
end_date = "2021-08-01"
lag_covid = lag_covid %>% filter(date >= start_date & date < end_date)
# Replace missing cumulative-vaccination values with 0 for now; these zeros
# are forward-filled from the previous day later in the pipeline.
lag_covid$people_vaccinated_per_hundred[is.na(lag_covid$people_vaccinated_per_hundred)] = 0
lag_covid$people_fully_vaccinated_per_hundred[is.na(lag_covid$people_fully_vaccinated_per_hundred)] = 0

# Find the lag between the first- and second-dose curves by sliding one
# series against the other, up to `windowsize` offsets each way, and
# measuring the Minkowski distance (order `p`) at every offset.
#
# Args:
#   FirstDose, SecondDose - single-column matrices of smoothed values
#   windowsize            - maximum offset (in rows/days) tried in each direction
#   p                     - Minkowski order passed to proxy::dist
# Returns the index of the smallest distance: indices 1..windowsize come from
# shifting the first-dose series, windowsize+1..2*windowsize from shifting
# the second-dose series (decode with lagType()).
#
# Improvement over the previous version: the distance vector is preallocated
# instead of grown with c() inside the loops, and the two while loops are
# now seq_len() for loops.
lagValue <- function(FirstDose, SecondDose, windowsize, p)
{
  dist_vector <- numeric(2 * windowsize)
  n_first <- nrow(FirstDose)
  n_second <- nrow(SecondDose)

  # Offsets where the first-dose series is shifted forward in time.
  for (i in seq_len(windowsize)) {
    FirstDose_subset <- FirstDose[i:n_first, 1]
    SecondDose_subset <- SecondDose[1:(n_second - i + 1), 1]
    dist_vector[i] <- proxy::dist(t(FirstDose_subset), t(SecondDose_subset),
                                  method = "Minkowski", p = p)
  }

  # Offsets where the second-dose series is shifted forward in time.
  for (j in seq_len(windowsize)) {
    FirstDose_subset <- FirstDose[1:(n_first - j + 1), 1]
    SecondDose_subset <- SecondDose[j:n_second, 1]
    dist_vector[windowsize + j] <- proxy::dist(t(FirstDose_subset), t(SecondDose_subset),
                                               method = "Minkowski", p = p)
  }

  # The index of the minimum distance corresponds to the lag.
  which.min(dist_vector)
}
# Collectors for the best-lag index of each country, one vector per distance
# metric (Minkowski order p = 1, 2, 3, i.e. Manhattan, Euclidean, Minkowski p=3).
lag_vector_1 <- c()
lag_vector_2 <- c()
lag_vector_3 <- c()
# loop through each country and calculate 3 time lags: manhattan distance, euclidean distance and minkowski (p=3) time lags
p = 1
while (p <= 3){
  z = 1
  while (z <= length(countrys)){
    # only select records for certain country and only select 1st and 2nd vaccine columns
    lagCovid_filtered = filter(lag_covid, location == countrys[z])
    combined_matrix <- cbind(lagCovid_filtered[,3], lagCovid_filtered[,4])
    
    # In the dataset, there are missing values. Will replace these missing values (0) with the value from the date before. Do it for both 1st and 2nd vaccine columns
    
    # Forward-fill zeros in column 1 (1st dose); the first row is left as-is.
    for (i in 1:nrow(combined_matrix)){
      if (i == 1){
      } else{
        if (combined_matrix[i,1] == 0){
          combined_matrix[i,1] = combined_matrix[i-1, 1]
        }
      }
    }
    
    # Forward-fill zeros in column 2 (2nd dose), same scheme as above.
    for (j in 1:nrow(combined_matrix)){
      if (j == 1){
      } else{
        if (combined_matrix[j,2] == 0){
          combined_matrix[j,2] = combined_matrix[j-1, 2]
        }
      }
    }
    
    # Apply smoothing function to 1st and 2nd vaccine columns. f = 0.15 is an arbitrary value
    
    combined_matrix_smooth<- as.matrix(apply(combined_matrix, 2, Lowess, f = 0.15))
    
    # Store each column separately as individual matrices
    FirstDose_matrix = as.matrix(combined_matrix_smooth[,1])
    SecondDose_matrix = as.matrix(combined_matrix_smooth[,2])
    
    # Input the individual matrices into the lagValue function to find the lag between the 1st and 2nd dose for a particular country
    lag <- lagValue(FirstDose_matrix, SecondDose_matrix, windowsize=100, p)
    
    #store value of lag, routed to the vector for the current Minkowski order p
    if (p == 1){
      lag_vector_1 <- c(lag_vector_1, lag)
    } else if (p == 2){
      lag_vector_2 <- c(lag_vector_2, lag)
    } else {
      lag_vector_3 <- c(lag_vector_3, lag)
    }
    z = z + 1
  }
  p = p + 1
}
# label the lag values with the corresponding country
names(lag_vector_1) <- countrys
names(lag_vector_2) <- countrys
names(lag_vector_3) <- countrys
# function to explain the time lag value

# Translate an index returned by lagValue() into a labelled lag value.
# Indices 1..windowsize came from shifting the first-dose series (a "First
# Dose Lag"); indices above windowsize came from shifting the second-dose
# series, so windowsize is subtracted back off. One extra unit is removed in
# both cases because index 1 corresponds to a lag of zero days.
lagType <- function(lag, windowsize)
{
  in_second_half <- lag > windowsize
  if (in_second_half) {
    c(LagType = "Second Dose Lag", Lag = lag - windowsize - 1)
  } else {
    c(LagType = "First Dose Lag", Lag = lag - 1)
  }
}
# Apply function to each country's Time lag value 
lag_df_1 = mapply(lagType, lag = lag_vector_1, windowsize = 100)
lag_df_2 = mapply(lagType, lag = lag_vector_2, windowsize = 100)
lag_df_3 = mapply(lagType, lag = lag_vector_3, windowsize = 100)

# Visualise Time lags

total_lag = cbind(t(lag_df_1), t(lag_df_2), t(lag_df_3))
# lag_vector_1/2/3 were computed with Minkowski order p = 1, 2, 3 (see the
# loop above), i.e. Manhattan, Euclidean and Minkowski(p=3) distance.
# BUG FIX: the Manhattan and Euclidean labels were previously swapped;
# the "Minkowksi" spelling is also corrected.
colnames(total_lag) = c("LagType", "Lag: Manhattan distance", "LagType", "Lag: Euclidean distance", "LagType", "Lag: Minkowski distance (P=3)")

4.2 Vaccine rollout index (VRI) and variable importance

4.2.0.1 Exploration

# Countries of interest for the policy/rollout-speed exploration.
# BUG FIX: "Iran" was listed twice, which made the per-country loop below
# process it twice and duplicate its rows in country_slope; it now appears once.
countries = c("United States", "India", "Brazil", "France", "United Kingdom", "Russia",
              "Turkey", "Italy", "Germany", "Spain", "Argentina",
              "Iran", "Colombia", "Poland", "Mexico", "Netherlands",
              "Indonesia", "Ukraine", "South Africa", "Philippines", "Peru", "Belgium",
              "Czechia", "Japan")

# Restrict both datasets to the selected countries.
policy_data <- covid_data %>% 
  filter(location %in% countries)

policy <- policy %>% filter(Entity %in% countries)

# Keep only the vaccination-related columns needed below.
policy_data <- policy_data %>% 
  select(iso_code, continent, location, date, new_people_vaccinated_smoothed_per_hundred, new_people_vaccinated_smoothed, new_vaccinations, new_vaccinations_smoothed, new_vaccinations_smoothed_per_million, population, people_vaccinated)

# Join vaccination counts with the policy level observed on the same day.
policy_data <- merge(policy_data, policy, by.x = c("iso_code", "date", "location"), by.y = c("Code", "Day", "Entity"))
# Estimate, for one country, how fast the vaccinated share of the population
# grew under each vaccination-policy stage (1-5): a linear model is fitted to
# people_vaccinated/population over the days spent at each stage and the
# fitted slope is stored. Stages never observed keep speed = 0.
#
# Args:
#   country_data   - all policy_data rows for this country
#   country_policy - one row per observed policy stage (first date seen)
#   country        - country name, copied into the result
# Returns a 5-row data frame: policy (factor 1-5), speed, country.
#
# NOTE(review): the NA guard below indexes country_policy while the slope
# data are taken from country_data — presumably intentional, but confirm.
get_slope <- function(country_data, country_policy, country) {
  i = 1
  # One row per policy stage; speed defaults to 0 for unobserved stages.
  country_slope <- data.frame(policy = as.factor(1:5), speed = rep(0, 5), country = rep(country, 5))
  
  while (i <= length(country_policy$vaccination_policy)) {
    
    current_policy = country_policy$vaccination_policy[i]
    
    # Only fit when the stage is active (> 0) and has a usable vaccination value.
    if (current_policy > 0 & !is.na(country_policy$people_vaccinated[country_policy$vaccination_policy == current_policy])) {
      # Vaccinated share of the population for every day at this stage.
      temp_people <- country_data$people_vaccinated[country_data$vaccination_policy == current_policy & !is.na(country_data$people_vaccinated)] / country_data$population[1]
      
      temp_index <- 1:length(temp_people)
      
      if (length(temp_index) > 1) {
        # Linear model
        mod <- lm(temp_people ~ temp_index)
        
        # Extract and store the slope
        country_slope$speed[country_slope$policy == current_policy] <- summary(mod)$coefficients[2,1]
      } else {
        # Some countries only have a policy for 1 day -> treat the slope as 0
        country_slope$speed[country_slope$policy == current_policy] <- 0
      }
      
    } 
    
    i <- i + 1
    
  }
  
  return(country_slope)
}
# Build one slope table per country, then combine them once at the end.
# (Previously the data frame was grown with rbind() inside the loop.)
slope_list <- lapply(countries, function(country) {
  country_data <- policy_data[policy_data$location == country, ]

  # First date each vaccination-policy stage was observed (rows with a
  # non-missing vaccination count only), with the columns get_slope() reads.
  country_policy <- country_data[!is.na(country_data$people_vaccinated), ] %>%
    group_by(vaccination_policy) %>%
    arrange(date) %>%
    slice(1L) %>%
    select(iso_code, location, date, vaccination_policy, new_vaccinations_smoothed, population, people_vaccinated)

  get_slope(country_data, country_policy, country)
})

country_slope <- do.call(rbind, slope_list)


# Dodged bar chart of per-policy vaccination speed (slope) for every country.
global <- ggplot(data = country_slope, aes(x = country, y = speed, fill = as.factor(policy))) + 
  geom_bar(stat = "identity", position = position_dodge()) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) + 
  scale_y_sqrt() + 
  ggtitle("All countries") + 
  xlab("Countries") + 
  ylab("Slope") + 
  labs(fill = "Availability stage")  # fixed legend typo ("Avalability")

global

4.2.0.2 VRI

5 {r vri-func}

# ct_model = function(df, log.y = FALSE, model = c("logis", "asymp", "linear"), model2 = "none") {
#   # cleaning
#   covid_clean = df %>%
#     filter(str_length(iso_code) <= 3) %>%
#     select(iso_code, location, date, people_vaccinated, population, aged_65_older, gdp_per_capita, cardiovasc_death_rate, population_density, hospital_beds_per_thousand, human_development_index, extreme_poverty, diabetes_prevalence, life_expectancy) %>%
#     filter(people_vaccinated != 0 & !is.na(people_vaccinated)) %>%
#     group_by(location) %>%
#     mutate(t_days = difftime(date, min(date), units = "days") %>% as.integer())
#
#   # initiate r_list, to store location, r, fit, ... respectively
#   r_list = list("location" = c(), "r" = c(), "fit" = c(), "y.real" = c(), "date" = c(), "t_days" = c(), "vri_data" = c(), "iso_code" = c())
#
#   ## formulae, based on selected model, people_vaccinated, and log.y
#   if (log.y) {
#     f2 = formula(log(people_vaccinated) ~ t_days)
#     # models
#     if (model == "logis") {
#       f1 = formula(log(people_vaccinated) ~ SSlogis(t_days, Asym, xmid, scal))
#     } else if (model == "asymp") {
#       f1 = formula(log(people_vaccinated) ~ SSasymp(t_days, Asym, R0, lrc))
#     }
#   } else {
#     f2 = formula(people_vaccinated ~ t_days)
#     # models
#     if (model == "logis") {
#       f1 = formula(people_vaccinated ~ SSlogis(t_days, Asym, xmid, scal))
#     } else if (model == "asymp") {
#       f1 = formula(people_vaccinated ~ SSasymp(t_days, Asym, R0, lrc))
#     }
#   }
#
#   ## for each country (location)
#   for (loc in unique(covid_clean$location)) {
#     # subset cleaned data to one country
#     covid_subset = covid_clean %>% filter(location == loc) %>% ungroup()
#     iso = unique(covid_subset$iso_code)
#
#     # try to fit model
#     fit = tryCatch(
#       # main function, fit asymptote regression model
#       expr = {
#         if (model == "linear") {
#           lm(f2, data = covid_subset)
#         } else {
#           nls(f1, data = covid_subset)
#         }
#       },
#       # if error, fit linear model
#       error = function(error_message){
#         if (model2 == "linear") {
#           return( lm(f2, data = covid_subset) )
#         } else if (model2 == "none") {
#           return( NULL )
#         }
#       }
#     )
#
#     # calculate my r value, may need transform before put into vri formula
#     if (class(fit) == "lm") {
#       r = coef(fit)["t_days"]
#     } else if (class(fit) == "nls") {
#       if (model == "logis") {scal = coef(fit)["scal"]; r = 1/scal} else
#       if (model == "asymp") {lrc = coef(fit)["lrc"]; r = exp(lrc)}
#     } else if (class(fit) == "NULL") {
#       r = NA
#     }
#
#     # vri = r * last people_vaccinated / end population
#     last_entry = tail(covid_subset, 1)
#     vri = r * last_entry$people_vaccinated / last_entry$population
#
#     median_data = covid_subset %>%
#       select(-date, -t_days, -location, -iso_code, -people_vaccinated) %>%
#       summarise(across(everything(), median, na.rm = TRUE))
#
#     vri_data = median_data %>%
#       mutate(iso_code = iso, location = loc, vri = vri, .before = 1)
#
#     r_list[[1]] = c(r_list[[1]], loc)
#     r_list[[2]] = c(r_list[[2]], r)
#     r_list[[3]] = c(r_list[[3]], list(fit))
#     r_list[[4]] = c(r_list[[4]], list(covid_subset$people_vaccinated))
#     r_list[[5]] = c(r_list[[5]], list(covid_subset$date))
#     r_list[[6]] = c(r_list[[6]], list(covid_subset$t_days))
#     r_list[[7]] = c(r_list[[7]], list(vri_data))
#     r_list[[8]] = c(r_list[[8]], iso)
#   }
#
#   ## Measuring model fitness, residual standard error (RSE).
#   # for (i in seq_len(length(r_list$fit))) {
#   #   if (log.y) {
#   #     # residuals = real - fitted
#   #     resid = r_list$y.real[[i]] - exp(fitted(r_list$fit[[i]]))
#   #     # degrees of freedom = n - k - 1, k = number of parameters
#   #     df = summary(r_list$fit[[i]])$df[2]
#   #     r_list$rse[[i]] = (sum(resid^2)/df)^(1/2)
#   #   } else {
#   #     r_list$rse[[i]] = sigma(r_list$fit[[i]])
#   #   }
#   # }
#
#   ## Measuring model fitness, Mean Absolute Scaled Error (MASE).
#   # reference: https://people.duke.edu/~rnau/compare.htm
#   # https://www.rdocumentation.org/packages/Metrics/versions/0.1.4/topics/mase
#   for (i in seq_len(length(r_list$fit))) {
#     r_list$mase[[i]] = Metrics::mase(r_list$y.real[[i]], fitted(r_list$fit[[i]]), step_size = 1)
#   }
#
#   return(r_list)
# }

# Example of use, inspect the r_list to see what's included
r_list1 <- ct_model(covid_data, log.y = FALSE, model = "logis")

r_list2 <- ct_model(covid_data, log.y = TRUE, model = "asymp")

# Collect the per-country VRI rows into a single data frame and preview it.
vri_data <- bind_rows(r_list1$vri_data) %>% data.frame()
head(vri_data)
# Merge the datasets

# Keep only the ISO code and the 2020 CPI score from the corruption table.
corruption <- select(corruption, c("ISO3", "CPI score 2020"))

# Most recent non-missing life-satisfaction value per country.
happiness <- happiness %>%
  group_by(Code) %>%
  arrange(Year) %>%
  filter(!is.na(`Life satisfaction in Cantril Ladder (World Happiness Report 2021)`)) %>%
  dplyr::summarise(satisfaction = last(`Life satisfaction in Cantril Ladder (World Happiness Report 2021)`))

# Join both indicator tables onto the VRI data by ISO country code.
joint_data <- vri_data %>%
  merge(corruption, by.x = "iso_code", by.y = "ISO3") %>%
  merge(happiness, by.x = "iso_code", by.y = "Code")

# GHE-INDEX
# Keep the 2021 overall GHS score per country.
ghs <- ghs %>% filter(Year == 2021) %>% select(Country, `OVERALL SCORE`)

## adding iso code to ghs dataset
ghs <- ghs %>% mutate(iso_code = iso.alpha(Country, 3), .before = 1)
# Fill in ISO3 codes that maps::iso.alpha() fails to resolve.
ghs <- ghs %>% mutate(
  iso_code = case_when(
    !is.na(iso_code) ~ iso_code,
    Country == "Bosnia and Hercegovina" ~ "BIH",
    Country == "Cabo Verde" ~ "CPV",
    Country == "Congo (Brazzaville)" ~ "COG",
    Country == "Congo (Democratic Republic)" ~ "COD",
    Country == "Côte d'Ivoire" ~ "CIV",
    # BUG FIX: eSwatini was mapped to "BIH" (Bosnia's code); ISO3 is "SWZ".
    Country == "eSwatini" ~ "SWZ",
    Country == "Kyrgyz Republic" ~ "KGZ",
    Country == "São Tomé and Príncipe" ~ "STP",
    Country == "St Kitts & Nevis" ~ "KNA",
    Country == "St Lucia" ~ "LCA",
    Country == "St Vincent & The Grenadines" ~ "VCT",
    Country == "United Kingdom" ~ "GBR",
    Country == "United States of America" ~ "USA"
  )
)


# Attach the GHS overall score by ISO code (ghs[,-2] drops the Country column).
joint_data <- merge(joint_data, ghs[,-2],  by.x=c("iso_code"), by.y=c("iso_code"))
# NOTE(review): renames by position — column 16 is assumed to be the merged
# GHS `OVERALL SCORE`; fragile if upstream column sets change. Confirm.
colnames(joint_data)[16] <- "GHS_score"
# take log for highly skewed variables
vri_extra_logged = joint_data %>% select(-population) %>%
  mutate(
    across(.cols = c(population_density, aged_65_older, gdp_per_capita, hospital_beds_per_thousand, cardiovasc_death_rate, extreme_poverty, diabetes_prevalence),
           ~log(.))
  ) %>% 
  rename_with(.cols = c(population_density, aged_65_older, gdp_per_capita, hospital_beds_per_thousand, cardiovasc_death_rate, extreme_poverty, diabetes_prevalence),
              .fn = ~paste0("log_", .))


# Rescale a numeric vector to [0, 1], ignoring NAs when finding the range.
# NAs in the input stay NA in the output.
min_max_norm <- function(x) {
  rng <- range(x, na.rm = TRUE)
  (x - rng[1]) / (rng[2] - rng[1])
}

# Min-max scale the candidate predictor columns (4 to 15) and reattach vri.
scaled_data <- data.frame(lapply(vri_extra_logged[4:15], min_max_norm), vri = vri_extra_logged$vri)

# Columns that must be non-missing for a row to be kept.
required_cols <- c(
  "vri", "log_population_density", "log_gdp_per_capita", "log_aged_65_older",
  "log_extreme_poverty", "human_development_index",
  "log_cardiovasc_death_rate", "log_hospital_beds_per_thousand"
)

# Final modelling columns, in display order.
keep_cols <- c(
  "vri", "log_gdp_per_capita", "log_aged_65_older", "log_population_density",
  "CPI.score.2020", "log_extreme_poverty",
  "satisfaction", "life_expectancy", "human_development_index",
  "log_cardiovasc_death_rate", "log_diabetes_prevalence",
  "log_hospital_beds_per_thousand", "GHS_score"
)

scaled_data_cleaned <- scaled_data %>%
  filter(if_all(all_of(required_cols), ~ !is.na(.x))) %>%
  select(all_of(keep_cols))
# Appendix

# Spearman's rank correlation between all retained variables;
# pairwise.complete.obs uses, for each pair, every row where both values exist.
spearman <- cor(scaled_data_cleaned, use="pairwise.complete.obs", method="spearman")

# Interactive correlation heatmap with linked scatter plots (qtlcharts),
# reordering variables by the clustering of the supplied correlation matrix.
qtlcharts::iplotCorr(
  scaled_data_cleaned,
  reorder=TRUE,
  corr = spearman,
  chartOpts = list(cortitle = "Spearman's correlation")
)
# Remove population_density based on scatter plot and correlation heatmap

scaled_data_cleaned <- scaled_data_cleaned %>% select(-log_population_density)
# hyper parameter grid search (definitely need a bit modify)
# Candidate values: mtry (variables tried at each split) and forest sizes.
mtry <-  seq(2, 6, by = 1)
num_trees <- c(100,150,200,250,300,350,400,450,500)


# Manual Search
#control <- trainControl(method="cv", number=3, search="grid")
grid_param <- expand.grid(.mtry=mtry)
modellist <- list()
# Fixed seed so the cross-validation folds are reproducible.
set.seed(388)
# For each fixed ntree, caret tunes mtry via 5-fold CV; the refit final
# randomForest model is kept (keyed by ntree) for the comparison below.
for (ntree in num_trees) {
    fit <- train( vri~., 
                      data= scaled_data_cleaned, 
                      method='rf', 
                      tuneGrid=grid_param, 
                        ntree= ntree,
                        importance=TRUE,
                      trControl=trainControl(method='cv', 
                        number=5) )
    key <- toString(ntree)
    modellist[[key]] <- fit$finalModel

}


## COMPARE RESULTS

# Track the best model by (training-set) MSE and by R^2.
# BUG FIX: the sentinels are now Inf/-Inf so the first model always replaces
# them; the previous hard-coded 1 and 0 silently kept modellist[[1]] whenever
# every model had MSE >= 1 or R^2 <= 0.
lowest_mse <- Inf
model_mse <- modellist[[1]]
highest_r2 <- -Inf
model_r2 <- modellist[[1]]

# The mean response is loop-invariant; compute it once.
result_avg <- mean(scaled_data_cleaned$vri)

for (i in seq_along(modellist)) {
  result <- predict(modellist[[i]], newdata = scaled_data_cleaned %>% select(-vri))
  mse <- mean((scaled_data_cleaned$vri - result)^2)
  r2 <- 1 - sum((scaled_data_cleaned$vri - result)^2) / sum((scaled_data_cleaned$vri - result_avg)^2)

  if (r2 > highest_r2) {
    highest_r2 <- r2
    model_r2 <- modellist[[i]]
  }
  if (mse < lowest_mse) {
    lowest_mse <- mse
    model_mse <- modellist[[i]]
  }
}

# Calculating importance of features to the model: Handling multicollinear features by using conditional permutation

## https://cran.r-project.org/web/packages/permimp/vignettes/permimp-package.html

# Refit the chosen forest (mtry = 2, ntree = 200) with keep.forest = TRUE so
# permimp() can work through the trained trees.
set.seed(388)
optimal_rf <- randomForest(vri ~ ., data= scaled_data_cleaned, mtry = 2, keep.forest = TRUE, keep.inbag = FALSE, ntree= 200)
## compute permutation importance
# conditional = TRUE requests conditional permutation importance (the
# multicollinearity handling referred to above); scaled = TRUE rescales values.
rf_permimp <- permimp(optimal_rf, progressBar = FALSE, conditional = TRUE,scaled = TRUE, thresholdDiagnostics = TRUE,type="response",do_check = FALSE)

# Shape the importance values into a data frame with variable names.
vimp <- as.data.frame(rf_permimp$values)
colnames(vimp)[1] <- "importance"
vimp <- round(vimp, 4)
vimp$var <- row.names(vimp)

# Disable scientific notation in printed output; keep the top 5 variables.
options(scipen=999)
vimp <- vimp %>%
  arrange(desc(importance)) %>%
  slice(1:5)


# Horizontal bar chart of the top-5 variables, largest importance on top.
ggplot(vimp, aes(x = reorder(var, importance), y = importance)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  theme_bw() +
  ggtitle("Top 5 important variables") +
  xlab("Factors in order") +
  ylab("Scaled importance")

6 Results

datatable(total_lag, options = list(columnDefs = list(list(className = 'dt-center', targets = "_all"))))
# Box plots of time lags for all 3 types of time lags measurements

total_lag_df <- as.data.frame(total_lag)
# Columns 2, 4, 6 hold the lags computed with Minkowski order p = 1, 2, 3
# respectively (see the lag loop above), i.e. Manhattan, Euclidean and
# Minkowski(p=3) distance. BUG FIX: the Manhattan and Euclidean column names
# were previously swapped, so the box plots were mislabelled.
colnames(total_lag_df) = c("LagType", "LagManhattanDistance", "LagType", "LagEuclideanDistance", "LagType", "LagMinkowskiDistanceP3")

box <- plot_ly(type = "box")
box <- box %>% add_boxplot(x = total_lag_df$LagManhattanDistance, boxpoints = "all", jitter = 0.3, name = "Manhattan Distance")
box <- box %>% add_boxplot(x = total_lag_df$LagEuclideanDistance, boxpoints = "all", jitter = 0.3, name = "Euclidean Distance")
box <- box %>% add_boxplot(x = total_lag_df$LagMinkowskiDistanceP3, boxpoints = "all", jitter = 0.3, name = "Minkowski Distance (P=3)")
box <- box %>% layout(title = "Box Plot of Lag Between 1st and 2nd Dose")
box

7 Discussion & Conclusion

8 Student Contributions

9 References

Tarr, G. (2021). DATA2002 data analytics: Learning from data. Unpublished Work, University of Sydney, Sydney Australia.
Wickham, H., Averick, M., Bryan, J., Chang, W., McGowan, L. D., François, R., … Yutani, H. (2019). Welcome to the tidyverse. Journal of Open Source Software, 4(43), 1686. https://doi.org/10.21105/joss.01686

Appendix

A Code

# Collect the names (labels) of every chunk in the document.
labs = knitr::all_labels()
# exclude chunk "setup" and "get-labels" (this chunk)
labs = setdiff(labs, c("setup", "get-labels"))
# can be used later if want all code in one chunk
# example plot
# Boxplots of displacement by cylinder count, coloured by cylinder count.
mtcars %>%
  rownames_to_column("car") %>%
  as_tibble() %>%
  ggplot(aes(x = factor(cyl), y = disp, colour = factor(cyl))) +
  geom_boxplot()